Team member: Yunjia Ma, Hang Zou, Meiyi Wang, Jingyi Wu, Grace Wang
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import statsmodels.api as sm
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn import tree, linear_model
from sklearn.metrics.pairwise import linear_kernel, sigmoid_kernel, cosine_similarity
%matplotlib inline
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:7: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import (to_datetime, Int64Index, DatetimeIndex, Period, /Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/statsmodels/tsa/base/tsa_model.py:7: FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
# Processing meta data
data = []
with open('meta_Luxury_Beauty.json', 'r') as f:
for l in tqdm(f):
data.append(json.loads(l))
/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_40221/724658956.py:4: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` for l in tqdm(f):
0it [00:00, ?it/s]
df_title = pd.DataFrame(columns = ['asin','title'])
for i in range(len(data)):
df_title.loc[i,'asin'] = data[i]['asin']
df_title.loc[i,'title'] = data[i]['title']
len(df_title)
12299
# Read the data frame and drop non-related columns
df = pd.read_csv("Luxury_Beauty.csv",low_memory=False)
df.columns
Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style/Size:', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote', 'style/Flavor Name:', 'style/Color:', 'image/0', 'image/1', 'image/2', 'image/3', 'image/4', 'image/5', 'image/6', 'image/7', 'image/8', 'style/Format:', 'style/Style Name:', 'style/Style:', 'style/Scent:', 'style/Package Quantity:', 'style/Flavor:', 'style/Package Type:', 'style/Scent Name:'], dtype='object')
df = df.drop(columns = ['style/Size:','vote','style/Flavor Name:', 'style/Color:',
'image/0', 'image/1', 'image/2', 'image/3', 'image/4', 'image/5',
'image/6', 'image/7', 'image/8', 'style/Format:', 'style/Style Name:',
'style/Style:', 'style/Scent:', 'style/Package Quantity:',
'style/Flavor:', 'style/Package Type:', 'style/Scent Name:'])
df.head(3)
overall | verified | reviewTime | reviewerID | asin | reviewerName | reviewText | summary | unixReviewTime | |
---|---|---|---|---|---|---|---|---|---|
0 | 5 | True | 01 5, 2018 | A2HOI48JK8838M | B00004U9V2 | DB | This handcream has a beautiful fragrance. It d... | Beautiful Fragrance | 1515110400 |
1 | 5 | True | 04 5, 2017 | A1YIPEY7HX73S7 | B00004U9V2 | Ajaey | wonderful hand lotion, for seriously dry skin,... | wonderful hand lotion | 1491350400 |
2 | 5 | True | 03 27, 2017 | A2QCGHIJ2TCLVP | B00004U9V2 | D. Jones | Best hand cream around. Silky, thick, soaks i... | Best hand cream around | 1490572800 |
len(df)
34278
# len(df[df.verified == False]) #16517
# 这里有个column verified不清楚什么意思,但是verified为false的row很多
# Merge two dataframes with the same asin
beauty = pd.merge(df, df_title, on="asin", how="left")
beauty.head(3)
overall | verified | reviewTime | reviewerID | asin | reviewerName | reviewText | summary | unixReviewTime | title | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | True | 01 5, 2018 | A2HOI48JK8838M | B00004U9V2 | DB | This handcream has a beautiful fragrance. It d... | Beautiful Fragrance | 1515110400 | Crabtree & Evelyn - Gardener's Ultra-Moist... |
1 | 5 | True | 01 5, 2018 | A2HOI48JK8838M | B00004U9V2 | DB | This handcream has a beautiful fragrance. It d... | Beautiful Fragrance | 1515110400 | Crabtree & Evelyn - Gardener's Ultra-Moist... |
2 | 5 | True | 04 5, 2017 | A1YIPEY7HX73S7 | B00004U9V2 | Ajaey | wonderful hand lotion, for seriously dry skin,... | wonderful hand lotion | 1491350400 | Crabtree & Evelyn - Gardener's Ultra-Moist... |
print('Dataset size: {:,} words'.format(len(beauty)))
Dataset size: 35,858 words
# Reformat datetime from raw form
beauty['reviewTime'] = pd.to_datetime(beauty['reviewTime'])
# Rearrange the left-to-right by relevance
beauty = beauty[['asin', 'title', 'summary', 'reviewText', 'overall', 'reviewerID', 'reviewerName',
'reviewTime', 'unixReviewTime']]
beauty.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | |
---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 |
beauty['year']=beauty['reviewTime'].dt.year
beauty['month']=beauty['reviewTime'].dt.month
beauty.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | 2018 | 1 |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | 2018 | 1 |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | 2017 | 4 |
beauty.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 35858 entries, 0 to 35857 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 asin 35858 non-null object 1 title 35808 non-null object 2 summary 35841 non-null object 3 reviewText 35845 non-null object 4 overall 35858 non-null int64 5 reviewerID 35858 non-null object 6 reviewerName 35858 non-null object 7 reviewTime 35858 non-null datetime64[ns] 8 unixReviewTime 35858 non-null int64 9 year 35858 non-null int64 10 month 35858 non-null int64 dtypes: datetime64[ns](1), int64(4), object(6) memory usage: 3.3+ MB
17 rows in the dataset do not have summay and 13 rows do not have reviewText. So we will remove rows do not have reviewText information.
beauty = beauty.dropna(subset=['reviewText'])
Now there are 35,845 product asin in our data frame, but not all asin has corresponding title. So we furthered review these asins and found titles for them.
beauty.asin[pd.isna(beauty.title)].unique()
array(['B0015ZC1FG', 'B0015Z90AA', 'B00172IEVM', 'B001C0W8QG'], dtype=object)
#beauty.title[beauty.asin == 'B0015ZC1FG'] = 'Mario Badescu Mario Badescu Mask 2 Oz'
#beauty.title[beauty.asin == 'B0015Z90AA'] = 'Mario Badescu Collagen Moisturizer SPF 15 for Combination & Sensitive Skin| Daytime Face Cream with Collagen & Cottonseed Oil | Softens the Look of Dry Lines | 2 Fl Oz'
#beauty.title[beauty.asin == 'B00172IEVM'] = 'Mario Badescu Chamomile Shampoo'
#beauty.title[beauty.asin == 'B001C0W8QG'] = 'La Roche-Posay Respectissime Liquid Eyeliner, 0.04 Fl oz'
beauty.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 35845 entries, 0 to 35857 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 asin 35845 non-null object 1 title 35795 non-null object 2 summary 35828 non-null object 3 reviewText 35845 non-null object 4 overall 35845 non-null int64 5 reviewerID 35845 non-null object 6 reviewerName 35845 non-null object 7 reviewTime 35845 non-null datetime64[ns] 8 unixReviewTime 35845 non-null int64 9 year 35845 non-null int64 10 month 35845 non-null int64 dtypes: datetime64[ns](1), int64(4), object(6) memory usage: 3.3+ MB
len(beauty['asin'].unique())
1581
Number of Unique Products in the Luxury Beauty Category = 1581
sum(beauty.duplicated())
5772
We found there are duplicated reviews. One person reviewed the same product for multiple times with the same text. This can mislead our analysis. So next, we will remove the duplicated reviews.
beauty = beauty.drop_duplicates()
beauty = beauty.reset_index(drop = True)
beauty['asin'].value_counts().head(20)
B003OGV7UO 694 B004N2S2JM 694 B0006PLMFQ 462 B000J4FGAG 459 B0058TE4WI 458 B007PORYUI 458 B00B59AULY 458 B00699JDKY 457 B00BXS9PFE 439 B00DTH63P2 439 B00014GT8W 307 B002K6AHQY 263 B00H2VO6P0 250 B0013U0EYI 242 B0002ZW5UQ 242 B00M0V39VE 235 B002HG7NX2 174 B000NG80GM 155 B000142FVW 152 B00J66M2SM 152 Name: asin, dtype: int64
Most reviewed products are B003OGV7UO and B004N2S2JM, which have 694 reviews.
print(beauty.title[beauty.asin == 'B003OGV7UO'].unique())
print(beauty.title[beauty.asin == 'B004N2S2JM'].unique())
['Creative Nail Design Shellac UV Color Coat, 25 Ounce'] ['Creative Nail Design Shellac UV Color Coat, 25 Ounce']
These two products are coming from the same brand - CND with different colors.
f, axes = plt.subplots(2,2, figsize=(14,11))
yearly = beauty.groupby(['year'])['reviewerID'].count().reset_index()
yearly = yearly.rename(columns={'reviewerID':'no_of_reviews'})
yearChart = sns.lineplot(x='year',y='no_of_reviews',data=yearly, ax = axes[0,0])
yearChart.set_title('No. of reviews over years')
monthly = beauty.groupby(['month'])['reviewerID'].count().reset_index()
monthly = monthly.rename(columns={'reviewerID':'no_of_reviews'})
monthChart = sns.barplot(x='month',y='no_of_reviews',data=monthly, ax = axes[0,1])
monthChart.set_title('No. of reviews over month')
monthChart.set_xticklabels(monthChart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
sns.countplot(x = 'overall', data = beauty, ax = axes[1,0] ).set_title('Overall Reviews')
beauty_reviews = beauty.loc[:,['asin','reviewerID','reviewerName','reviewText','summary','overall']]
beauty_reviews['reviewLength'] = beauty_reviews['reviewText'].apply(lambda x: len(x.split()))
reviews_word_length = beauty_reviews.groupby(pd.cut(beauty_reviews.reviewLength, np.arange(0,1000,100))).count()
reviews_word_length = reviews_word_length.rename(columns={'reviewLength':'count'})
reviews_word_length = reviews_word_length.reset_index()
#print(reviews_word_length)
reviewLengthChart = sns.barplot(x='reviewLength',y='count',data=reviews_word_length, ax = axes[1,1])
reviewLengthChart.set_title('Distribution of Reviews by word length')
reviewLengthChart.set_xticklabels(reviewLengthChart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
f.tight_layout()
Majority of examples were rated highly (>=4).
from wordcloud import WordCloud
text = " ".join(word for word in beauty['reviewText'].astype(str))
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
wordcloud.generate(text)
wordcloud.to_image()
import re
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def preprocess(text):
text = text.lower() #lowercase
text = re.sub(r'[^\w\s]', '', text) #remove punctuations
text = re.sub(r'\d+', '', text) #remove numbers
text = " ".join(text.split()) #stripWhitespace
text = text.split()
text = [x for x in text if x not in stop_words] #remove stopwords
#text = [x for x in text if x not in ['product']] #remove task specific stopwords
text = " ".join(text)
# stemmer_ps = PorterStemmer()
# text = [stemmer_ps.stem(word) for word in text.split()] #stemming
# text = " ".join(text)
# lemmatizer = WordNetLemmatizer()
# text = [lemmatizer.lemmatize(word) for word in text.split()] #lemmatization
# text = " ".join(text)
return(text)
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
beauty['review_processed'] = beauty['reviewText'].apply(lambda x:preprocess(str(x)))
beauty['review_processed'] = beauty['review_processed'].apply(lambda x:x.split())
beauty.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | review_processed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | 2018 | 1 | [handcream, beautiful, fragrance, doesnt, stay... |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | 2017 | 4 | [wonderful, hand, lotion, seriously, dry, skin... |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Best hand cream around | Best hand cream around. Silky, thick, soaks i... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-03-27 | 1490572800 | 2017 | 3 | [best, hand, cream, silky, soaks, way, leaving... |
beauty['pos_neg'] = [1 if x > 3 else 0 for x in beauty.overall]
beauty.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | review_processed | pos_neg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | 2018 | 1 | [handcream, beautiful, fragrance, doesnt, stay... | 1 |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | 2017 | 4 | [wonderful, hand, lotion, seriously, dry, skin... | 1 |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Best hand cream around | Best hand cream around. Silky, thick, soaks i... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-03-27 | 1490572800 | 2017 | 3 | [best, hand, cream, silky, soaks, way, leaving... | 1 |
x_train, x_test, y_train, y_test = train_test_split(beauty.reviewText, beauty.pos_neg, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
(22554,) (7519,) (22554,) (7519,)
alltext= x_train.tolist()+ x_test.tolist()
vectorizer = CountVectorizer(min_df=5).fit(x_train)
X_train = vectorizer.transform(x_train)
print("X_train:\n{}".format(repr(X_train)))
X_train: <22554x8773 sparse matrix of type '<class 'numpy.int64'>' with 1248545 stored elements in Compressed Sparse Row format>
feature_names = vectorizer.get_feature_names_out()
print("Number of features: {}".format(len(feature_names)))
Number of features: 8773
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
X_test = vectorizer.transform(x_test)
log_y_pred = logreg.predict(X_test)
logreg_score = accuracy_score(y_test, log_y_pred)
print("Accuracy: {:.3f}".format(logreg_score))
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result(
Accuracy: 0.869
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))
Training set score: 0.907 Test set score: 0.869
log_cfm = confusion_matrix(y_test, log_y_pred)
print("Confusion matrix:")
print(log_cfm, end='\n\n')
print('-'*15)
print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))
Confusion matrix: [[ 744 720] [ 268 5787]] --------------- [['TN' 'FP'] ['FN' 'TP']]
plt.imshow(log_cfm, interpolation='nearest')
for i, j in itertools.product(range(log_cfm.shape[0]), range(log_cfm.shape[1])):
plt.text(j, i, log_cfm[i, j],
horizontalalignment="center",
color="white")
plt.ylabel('True label (Recall)')
plt.xlabel('Predicted label (Precision)')
plt.title('Logistic Reg | Confusion Matrix')
plt.colorbar();
log_f1 = f1_score(y_test, log_y_pred)
print("Logistic Reg - F1 score: {:.3f}".format(log_f1))
Logistic Reg - F1 score: 0.921
mnb = MultinomialNB(alpha=.01)
mnb.fit(X_train, y_train)
MultinomialNB(alpha=0.01)
mnb_y_pred = mnb.predict(X_test)
mnb_score = accuracy_score(y_test, mnb_y_pred)
print("Accuracy: {:.3f}".format(mnb_score))
Accuracy: 0.856
print("Training set score: {:.3f}".format(mnb.score(X_train, y_train)))
print("Test set score: {:.3f}".format(mnb.score(X_test, y_test)))
Training set score: 0.895 Test set score: 0.856
mnb_cfm = confusion_matrix(y_test, mnb_y_pred)
print("Confusion matrix:")
print(mnb_cfm, end='\n\n')
print('-'*15)
print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))
Confusion matrix: [[ 856 608] [ 476 5579]] --------------- [['TN' 'FP'] ['FN' 'TP']]
plt.imshow(mnb_cfm, interpolation='nearest')
for i, j in itertools.product(range(mnb_cfm.shape[0]), range(mnb_cfm.shape[1])):
plt.text(j, i, mnb_cfm[i, j],
horizontalalignment="center",
color="white")
plt.ylabel('True label (Recall)')
plt.xlabel('Predicted label (Precision)')
plt.title('Multinomial | Confusion Matrix')
plt.colorbar();
mnb_f1 = f1_score(y_test, mnb_y_pred)
print("Multinomial NB - F1 score: {:.3f}".format(mnb_f1))
Multinomial NB - F1 score: 0.911
mnb_tfidfvectorizer = TfidfVectorizer(min_df=5).fit(x_train)
mnb_X_train = mnb_tfidfvectorizer.transform(x_train)
print("X_train:\n{}".format(repr(mnb_X_train)))
X_train: <22554x8773 sparse matrix of type '<class 'numpy.float64'>' with 1248545 stored elements in Compressed Sparse Row format>
mnb_X_test = mnb_tfidfvectorizer.transform(x_test)
mnb_y_pred = mnb.predict(mnb_X_test)
mnb_score2 = accuracy_score(y_test, mnb_y_pred)
print("Accuracy: {:.3f}".format(mnb_score2))
Accuracy: 0.820
print("Training set score: {:.3f}".format(mnb.score(mnb_X_train, y_train)))
print("Test set score: {:.3f}".format(mnb.score(mnb_X_test, y_test)))
Training set score: 0.835 Test set score: 0.820
mnb_cfm2 = confusion_matrix(y_test, mnb_y_pred)
print("Confusion matrix:")
print(mnb_cfm2, end='\n\n')
print('-'*15)
print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))
Confusion matrix: [[ 145 1319] [ 31 6024]] --------------- [['TN' 'FP'] ['FN' 'TP']]
plt.imshow(mnb_cfm2, interpolation='nearest')
for i, j in itertools.product(range(mnb_cfm2.shape[0]), range(mnb_cfm2.shape[1])):
plt.text(j, i, mnb_cfm2[i, j],
horizontalalignment="center",
color="white")
plt.ylabel('True label (Recall)')
plt.xlabel('Predicted label (Precision)')
plt.title('Multinomial | Confusion Matrix')
plt.colorbar();
mnb2_f1 = f1_score(y_test, mnb_y_pred)
print("Multinomial NB - F1 score: {:.3f}".format(mnb2_f1))
Multinomial NB - F1 score: 0.899
beauty.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | review_processed | pos_neg | year | month | pos_neg_predictLogit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | [handcream, beautiful, fragrance, doesnt, stay... | 1 | 2018 | 1 | 1 |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | [wonderful, hand, lotion, seriously, dry, skin... | 1 | 2017 | 4 | 1 |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Best hand cream around | Best hand cream around. Silky, thick, soaks i... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-03-27 | 1490572800 | [best, hand, cream, silky, soaks, way, leaving... | 1 | 2017 | 3 | 1 |
review_text = beauty['reviewText'].apply(lambda x:preprocess(x))
# Use tf-idf features
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf = tfidf_vectorizer.fit_transform(review_text)
# Use tf features
tf_vectorizer = CountVectorizer(stop_words=stop_words)
tf = tf_vectorizer.fit_transform(review_text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print("Number of total features: {}".format(len(tfidf_feature_names)))
Number of total features: 33037
# Initialize NMF
nmf = NMF(n_components=10, random_state=1,
alpha=.1, l1_ratio=.5)
# Initialize Ida
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=0)
num_top_words = 15
def retrieve_top_words(model, feature_names, num_top_words):
for idx, topic in enumerate(model.components_):
print("Topic #{}:".format(idx), end='\n')
print(" ".join([feature_names[i]
for i in topic.argsort()[:-num_top_words - 1:-1]]), end='\n\n')
print()
nmf_tf = nmf.fit(tf)
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead warnings.warn( /Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:289: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26). warnings.warn( /Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1637: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence. warnings.warn(
nmf_ = nmf_tf.transform(tf)
Counter([np.argmax(i) for i in nmf_])
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead warnings.warn(
Counter({9: 3063, 7: 1537, 0: 3836, 4: 3396, 6: 4002, 3: 2903, 5: 7413, 8: 1090, 1: 390, 2: 2443})
retrieve_top_words(nmf_tf, tfidf_feature_names, num_top_words)
Topic #0: skin dry sensitive products oil serum moisturizer oily feels feel does foundation care ingredients using Topic #1: dermablend loreal professional spf cover look powder products nice makeup setting product skin light medium Topic #2: hair use dry shampoo iron conditioner using just used products long great oil heat hold Topic #3: like really dont just im feel doesnt look little ive does didnt think good smell Topic #4: product products used price good great does using did apply ingredients try little im results Topic #5: color polish love great colors nail coat nails looks look pink coats just nice essie Topic #6: use face using just used time dont makeup ive im day really little results did Topic #7: cream shave shaving eye products night creams hand using eyes used razor good does hands Topic #8: brush foundation makeup powder brushes shaving use bristles coverage shave clarisonic soap good handle blush Topic #9: scent smell fragrance bottle nice love perfume strong light body good lotion smells floral does
lda_tf = lda.fit(tf)
lda_ = lda_tf.transform(tf)
Counter([np.argmax(i) for i in lda_])
Counter({8: 14554, 7: 3987, 6: 6758, 4: 4132, 5: 167, 3: 54, 0: 102, 2: 191, 9: 93, 1: 35})
retrieve_top_words(lda_tf, tfidf_feature_names, num_top_words)
Topic #0: neon candle basic replacement seriously ok wood orange dissipates zero delivery juicy spice josie package Topic #1: iron feet curling tanning hot nuface curl cord heat mitt tanner barrel irons curler xentan Topic #2: perfume fragrance device wife like classalinknormal datahookproductlinklinked cologne loves likes wear floral bristles scents unique Topic #3: lashes dryer japonesque mud button mario tizo features eyelashes badescu settings tobacco tonic setting temperature Topic #4: color polish love nail nails coat essie coats beautiful colors pretty like pink time bottle Topic #5: brush shave shaving soap razor cream handle lather beard proraso black kit clippers thank close Topic #6: color great like foundation look makeup nice powder product really light coverage dark good apply Topic #7: hair scent like smell nice great smells spray just bottle love fragrance body pleasant strong Topic #8: skin product use face like using really cream used products just does im good dont Topic #9: ingredients oil acid extract list butter contains alcohol shea organic loccitane seed sodium fragrance active
nmf_tfidf = nmf.fit(tfidf)
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead warnings.warn( /Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:289: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26). warnings.warn( /Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1637: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence. warnings.warn(
nmf_W = nmf_tfidf.transform(tfidf)
Counter([np.argmax(i) for i in nmf_W])
/Users/jyiwu/opt/anaconda3/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:1422: FutureWarning: `alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead warnings.warn(
Counter({9: 9065, 0: 6608, 2: 1795, 8: 1233, 3: 1723, 1: 2345, 7: 1310, 5: 1100, 6: 2324, 4: 2570})
retrieve_top_words(nmf_tfidf, tfidf_feature_names, num_top_words)
Topic #0: skin face cream dry use using sensitive products moisturizer feels feel cleanser feeling oily soft Topic #1: color beautiful pretty summer perfect coats pink favorite neutral bright essie looks fall picture awesome Topic #2: great works smells stuff summer price shipping color brand neutral fast looks item quality seller Topic #3: love products color stuff cnd colors brand essie absolutely amazing compliments shade smell shellac new Topic #4: hair shampoo conditioner use iron dry spray fine hold curls makes soft used works scalp Topic #5: good price stuff smells coverage pretty quality job works feels does overall looks polish buy Topic #6: polish nail essie nails coat colors coats polishes favorite chips opi chip pink best fast Topic #7: product excellent years using used described recommend wonderful quality received use price products time results Topic #8: nice scent smells smell light color really feels bold little smooth fragrance different stays clean Topic #9: like really just little use foundation makeup dont scent look does long bit powder day
lda_tfidf = lda.fit(tfidf)
lda_W = lda_tfidf.transform(tfidf)
Counter([np.argmax(i) for i in lda_W])
Counter({6: 29037, 5: 91, 3: 39, 8: 718, 0: 49, 7: 19, 2: 27, 4: 38, 1: 42, 9: 13})
retrieve_top_words(lda_tfidf, tfidf_feature_names, num_top_words)
Topic #0: ok charcoal speedy ty lengthens responsive handcream topicals temptu winterlike bronco gulsha aa irrelevant wellso Topic #1: tizo extract glycol sodium seed buttery ci glycerin dimethicone cashmere leaf cinnamon phenoxyethanol precious alright Topic #2: elta md raspberry perfecto expectedits browngray mystery shown massages paints baroness overthetop rite bizarre untreated Topic #3: tobacco vibrant aaa sparkling excelente authenticity thigh inserts depicted doubtful valid granuals neiman offenders behold Topic #4: awesome gifting colore dusty coater pans opaqueness allinone channel midnight crystals nephew manis preserving daves Topic #5: described item purchase waste promptly exppected money def useful arrived attractively excelent nostrils advertising width Topic #6: skin color product like use hair great good love really nice face just does scent Topic #7: cute diamond dissipated leans vibration gardening choo social sisters reordered vampire represented lends overdoing whisper Topic #8: love great color product watered toothpaste michelle complaints unexpected looks spirit steady stains latte basecoatbut Topic #9: underarms eyeliners typehidden videoblockdivinput div classvideourlinput aspacingsmall aspacingtopmini classasection bathe recipient swatched rules crisco omg
tfidf.shape
(30073, 33037)
sig_kern = sigmoid_kernel(tfidf, tfidf)
sig_kern.shape
(30073, 30073)
sig_kern
array([[0.76160687, 0.76159416, 0.76159511, ..., 0.76159416, 0.76159416, 0.76159416], [0.76159416, 0.76160687, 0.76159545, ..., 0.76159416, 0.76159416, 0.76159416], [0.76159511, 0.76159545, 0.76160687, ..., 0.76159416, 0.76159416, 0.76159416], ..., [0.76159416, 0.76159416, 0.76159416, ..., 0.76160687, 0.76159416, 0.76159416], [0.76159416, 0.76159416, 0.76159416, ..., 0.76159416, 0.76160687, 0.76159455], [0.76159416, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159455, 0.76160687]])
index = pd.Series(beauty.index, index=beauty['title']).drop_duplicates()
index
title Crabtree & Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ 0 Crabtree & Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ 1 Crabtree & Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ 2 Crabtree & Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ 3 Crabtree & Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ 4 ... TIZO Photoceutical AM Replenish SPF 40 Sunscreen Primer, 1 fl. oz. 30068 TIZO Photoceutical AM Replenish SPF 40 Sunscreen Primer, 1 fl. oz. 30069 ORIBE Bright Blonde Radiance and Repair Treatment, 4.2 fl. oz. 30070 ORIBE Bright Blonde Radiance and Repair Treatment, 4.2 fl. oz. 30071 ELEMIS Superfood Facial Oil - Nourishing Face Oil, 0.5 fl. oz. 30072 Length: 30073, dtype: int64
def recommend_beauty(name, sig_kern=sig_kern):
indx = index[name]
sigmoid_score = list(enumerate(sig_kern[indx]))
sigmoid_score = sorted(sigmoid_score, key = lambda x:x[1], reverse = True)
sigmoid_score = sigmoid_score[1:4]
position = [i[0] for i in sigmoid_score]
return beauty.iloc[position]
your_favorite_beauty_title = "Crabtree & Evelyn - Gardener's Ultra-Moisturising Hand Therapy Pump - 250g/8.8 OZ"
beauty[beauty.title == your_favorite_beauty_title]
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | review_processed | pos_neg | year | month | pos_neg_predictLogit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | [handcream, beautiful, fragrance, doesnt, stay... | 1 | 2018 | 1 | 1 |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | [wonderful, hand, lotion, seriously, dry, skin... | 1 | 2017 | 4 | 1 |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Best hand cream around | Best hand cream around. Silky, thick, soaks i... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-03-27 | 1490572800 | [best, hand, cream, silky, soaks, way, leaving... | 1 | 2017 | 3 | 1 |
3 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Five Stars | Thanks!! | 5 | A2R4UNHFJBA6PY | Amazon Customer | 2017-03-20 | 1489968000 | [thanks] | 1 | 2017 | 3 | 1 |
4 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Great hand lotion! | Great hand lotion. Soaks right in and leaves ... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-02-28 | 1488240000 | [great, hand, lotion, soaks, right, leaves, sk... | 1 | 2017 | 2 | 1 |
5 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Five Stars | Great product. Doesn't leave you hands feeling... | 5 | A1606LA683WZZU | Amr | 2017-02-25 | 1487980800 | [great, product, doesnt, leave, hands, feeling... | 1 | 2017 | 2 | 1 |
6 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Five Stars | Just as described. Arrived on time. | 5 | A1606LA683WZZU | Amr | 2017-01-30 | 1485734400 | [just, described, arrived, time] | 1 | 2017 | 1 | 1 |
7 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Smells good, absorbs quickly | Nice lightweight hand cream for the summer. | 4 | A1YY53NQXFKMRN | Trixie | 2017-01-24 | 1485216000 | [nice, lightweight, hand, cream, summer] | 1 | 2017 | 1 | 1 |
8 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Five Stars | Best hand cream ever. | 5 | A3R0NQ9E53JHYQ | T. Hooth | 2016-12-01 | 1480550400 | [best, hand, cream] | 1 | 2016 | 12 | 1 |
9 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | BEST hand cream ever | BEST hand cream ever. One at work, one in my ... | 5 | A3R0NQ9E53JHYQ | T. Hooth | 2016-12-01 | 1480550400 | [best, hand, cream, work, purse, nightstand, k... | 1 | 2016 | 12 | 1 |
10 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Great stuff! | 'Love this stuff. Very smooth, very moisturiz... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2016-10-12 | 1476230400 | [love, stuff, smooth, moisturizing, got, gift,... | 1 | 2016 | 10 | 1 |
11 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Five Stars | I like the cream it's so beautiful | 5 | A35YXEDATMIJ9S | asma | 2016-09-11 | 1473552000 | [like, cream, beautiful] | 1 | 2016 | 9 | 1 |
12 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Paying for the name | There is no evidence to me that this product i... | 3 | A2HOI48JK8838M | DB | 2016-08-14 | 1471132800 | [evidence, product, improvement, similarly, pr... | 0 | 2016 | 8 | 1 |
13 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Great scent! | I LOVE this scent.... it's so refreshing and ... | 5 | A3CRJ1Q73RB25F | Annie T | 2016-03-20 | 1458432000 | [love, scent, refreshing, clean, yummy, dont, ... | 1 | 2016 | 3 | 1 |
14 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | The Best Hand Cream Ever! | The Best Hand Cream Ever, I love this product.... | 5 | A1Y39RECFXEGNL | Art | 2016-02-13 | 1455321600 | [best, hand, cream, love, product, greasy] | 1 | 2016 | 2 | 1 |
15 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | The Best Hand Cream Ever | Amazing ....my favorite hand cream ever | 5 | A1Y39RECFXEGNL | Art | 2016-02-13 | 1455321600 | [amazing, favorite, hand, cream] | 1 | 2016 | 2 | 1 |
16 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Favorite hand cream! | My sons favorite hand cream for his dry hands.... | 5 | A3O0BXK3SZ6FE0 | Newman | 2016-01-19 | 1453161600 | [sons, favorite, hand, cream, dry, hands, grea... | 1 | 2016 | 1 | 1 |
17 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | This hand cream is addicting and the scent is ... | This hand cream is addicting and the scent is ... | 5 | AWVF9EWW3BXYV | A_V | 2016-01-11 | 1452470400 | [hand, cream, addicting, scent, pleasant, does... | 1 | 2016 | 1 | 1 |
18 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Five Stars | I love this hand cream. I'm on my fifth tube. ... | 5 | AWVF9EWW3BXYV | A_V | 2015-11-23 | 1448236800 | [love, hand, cream, im, fifth, tube, love, lov... | 1 | 2015 | 11 | 1 |
19 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | The best hand cream ever. | The best hand cream ever. I'm constantly wash... | 4 | A1Y39RECFXEGNL | Art | 2014-07-06 | 1404604800 | [best, hand, cream, im, constantly, washing, h... | 1 | 2014 | 7 | 1 |
20 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | I like the moisturizing properties, but the sm... | Love the moisturizing properties of the lotion... | 3 | ACUGBCEADYT6D | Booklass | 2014-06-13 | 1402617600 | [love, moisturizing, properties, lotion, smell... | 0 | 2014 | 6 | 1 |
21 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | A favorite | My son used some of this hand cream when I had... | 5 | A3O0BXK3SZ6FE0 | Newman | 2014-05-05 | 1399248000 | [son, used, hand, cream, daily, user, fan, bou... | 1 | 2014 | 5 | 1 |
22 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | An effective hand cream with a wonderful scent | This hand cream has one of the nicest fragranc... | 5 | A3S3R88HA0HZG3 | PT Cruiser | 2012-12-31 | 1356912000 | [hand, cream, nicest, fragrances, ive, come, e... | 1 | 2012 | 12 | 1 |
recommend_beauty(your_favorite_beauty_title)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Input In [145], in <cell line: 1>() ----> 1 recommend_beauty(your_favorite_beauty_title) Input In [140], in recommend_beauty(name, sig_kern) 2 indx = index[name] 3 sigmoid_score = list(enumerate(sig_kern[indx])) ----> 4 sigmoid_score = sorted(sigmoid_score, key = lambda x:x[1], reverse = True) 5 sigmoid_score = sigmoid_score[1:4] 6 position = [i[0] for i in sigmoid_score] ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
beauty.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | review_processed | pos_neg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | 2018 | 1 | [handcream, beautiful, fragrance, doesnt, stay... | 1 |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | 2017 | 4 | [wonderful, hand, lotion, seriously, dry, skin... | 1 |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Best hand cream around | Best hand cream around. Silky, thick, soaks i... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-03-27 | 1490572800 | 2017 | 3 | [best, hand, cream, silky, soaks, way, leaving... | 1 |
all_nails = beauty.copy()
all_nails['title_processed'] = all_nails['title'].apply(lambda x:preprocess(str(x)))
all_nails['title_processed'] = all_nails['title_processed'].apply(lambda x:x.split())
all_nails.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | review_processed | pos_neg | title_processed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Beautiful Fragrance | This handcream has a beautiful fragrance. It d... | 5 | A2HOI48JK8838M | DB | 2018-01-05 | 1515110400 | 2018 | 1 | [handcream, beautiful, fragrance, doesnt, stay... | 1 | [crabtree, amp, evelyn, gardeners, ultramoistu... |
1 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | wonderful hand lotion | wonderful hand lotion, for seriously dry skin,... | 5 | A1YIPEY7HX73S7 | Ajaey | 2017-04-05 | 1491350400 | 2017 | 4 | [wonderful, hand, lotion, seriously, dry, skin... | 1 | [crabtree, amp, evelyn, gardeners, ultramoistu... |
2 | B00004U9V2 | Crabtree & Evelyn - Gardener's Ultra-Moist... | Best hand cream around | Best hand cream around. Silky, thick, soaks i... | 5 | A2QCGHIJ2TCLVP | D. Jones | 2017-03-27 | 1490572800 | 2017 | 3 | [best, hand, cream, silky, soaks, way, leaving... | 1 | [crabtree, amp, evelyn, gardeners, ultramoistu... |
ls = []
for i in all_nails.index:
if "nail" in all_nails.loc[i,'title_processed']:
ls.append(i)
len(ls)
6809
all_nails = all_nails.iloc[ls,:]
all_nails = all_nails.reset_index(drop = True)
all_nails.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | review_processed | pos_neg | title_processed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B000142FVW | OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0... | Five Stars | Love this color and brand | 5 | AJ9VB45YRX8C9 | Amazon Customer | 2018-04-11 | 1523404800 | 2018 | 4 | [love, color, brand] | 1 | [opi, nail, lacquer, boraboraing, pink, fl, oz] |
1 | B000142FVW | OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0... | Gifted | Was bought for a gift. Recipient was thrilled. | 5 | A39ECS1S0CJ0X9 | Amy Z. | 2018-04-03 | 1522713600 | 2018 | 4 | [bought, gift, recipient, thrilled] | 1 | [opi, nail, lacquer, boraboraing, pink, fl, oz] |
2 | B000142FVW | OPI Nail Lacquer, Not So Bora-Bora-ing Pink, 0... | Five Stars | looks great | 5 | AI2YYV9D3LS8P | breezy deaton | 2018-03-18 | 1521331200 | 2018 | 3 | [looks, great] | 1 | [opi, nail, lacquer, boraboraing, pink, fl, oz] |
all_nails.overall.value_counts().sort_index(ascending = False)
5 4673 4 908 3 567 2 323 1 338 Name: overall, dtype: int64
all_nails.to_csv("all_nails.csv")
import re
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def preprocess(text):
text = text.lower() #lowercase
text = re.sub(r'[^\w\s]', '', text) #remove punctuations
text = re.sub(r'\d+', '', text) #remove numbers
text = " ".join(text.split()) #stripWhitespace
text = text.split()
text = [x for x in text if x not in stop_words] #remove stopwords
text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
text = " ".join(text)
# stemmer_ps = PorterStemmer()
# text = [stemmer_ps.stem(word) for word in text.split()] #stemming
# text = " ".join(text)
# lemmatizer = WordNetLemmatizer()
# text = [lemmatizer.lemmatize(word) for word in text.split()] #lemmatization
# text = " ".join(text)
return(text)
all_nails['review_processed']=all_nails['reviewText'].apply(lambda x:preprocess(x))
all_nails['review_processed']=all_nails['review_processed'].apply(lambda x:x.split())
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in all_nails.review_processed[(all_nails.overall == 4)|(all_nails.overall == 5)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in all_nails.review_processed[(all_nails.overall == 1)|(all_nails.overall == 2)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences=all_nails['review_processed'].tolist(),
size=100, sg=1,
min_count=5,window=10,
workers=-1,seed=10,iter=250)
vocab = model.wv.index2word
len(vocab)
1939
model.wv.most_similar('disappointed', topn=10)
[('imatation', 0.3182666301727295), ('lost', 0.30434471368789673), ('doesnt', 0.30145254731178284), ('effectively', 0.2999058663845062), ('noting', 0.2995549440383911), ('needs', 0.2948138117790222), ('clearly', 0.28421464562416077), ('shoot', 0.279448926448822), ('instead', 0.27057692408561707), ('cuticles', 0.26807481050491333)]
from gensim import corpora
dictionary = corpora.Dictionary(all_nails['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF
id | word | |
---|---|---|
0 | 0 | brand |
1 | 1 | bought |
2 | 2 | gift |
3 | 3 | recipient |
4 | 4 | thrilled |
... | ... | ... |
5077 | 5077 | germany |
5078 | 5078 | grooves |
5079 | 5079 | maraschino |
5080 | 5080 | traveling |
5081 | 5081 | wider |
5082 rows × 2 columns
all_nails['review_ids']=all_nails['review_processed'].apply(lambda x:dictionary.doc2bow(x))
from gensim import models
num_topics=7
ldamodel = models.ldamodel.LdaModel(all_nails.review_ids[(all_nails.overall == 4)|(all_nails.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.025*"perfect" + 0.024*"beautiful" + 0.015*"great" + 0.015*"goes" + 0.015*"colors" + 0.014*"time"') (1, '0.041*"great" + 0.017*"coat" + 0.011*"looks" + 0.009*"coats" + 0.009*"bottle" + 0.009*"used"') (2, '0.027*"great" + 0.018*"nice" + 0.015*"beautiful" + 0.015*"perfect" + 0.013*"fast" + 0.013*"looks"') (3, '0.019*"coats" + 0.018*"pretty" + 0.012*"shade" + 0.011*"perfect" + 0.011*"little" + 0.011*"good"') (4, '0.017*"pink" + 0.016*"coat" + 0.015*"dark" + 0.015*"colors" + 0.015*"looks" + 0.012*"great"') (5, '0.044*"great" + 0.023*"good" + 0.020*"pink" + 0.015*"summer" + 0.013*"bright" + 0.010*"time"') (6, '0.028*"pretty" + 0.017*"nice" + 0.015*"coats" + 0.013*"use" + 0.013*"look" + 0.011*"great"')
from gensim import models
num_topics=7
ldamodel = models.ldamodel.LdaModel(all_nails.review_ids[(all_nails.overall == 1)|(all_nails.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.020*"coats" + 0.012*"coat" + 0.011*"disappointed" + 0.011*"quality" + 0.010*"looks" + 0.010*"time"') (1, '0.013*"colors" + 0.013*"coat" + 0.012*"chips" + 0.012*"bought" + 0.012*"great" + 0.009*"better"') (2, '0.011*"coat" + 0.010*"picture" + 0.009*"pink" + 0.008*"nice" + 0.008*"u" + 0.008*"chips"') (3, '0.021*"disappointed" + 0.019*"coats" + 0.010*"wont" + 0.009*"time" + 0.009*"does" + 0.008*"pink"') (4, '0.018*"pink" + 0.011*"coat" + 0.010*"looking" + 0.009*"disappointment" + 0.009*"money" + 0.008*"coats"') (5, '0.013*"dont" + 0.012*"doesnt" + 0.011*"looks" + 0.008*"coats" + 0.007*"use" + 0.007*"green"') (6, '0.019*"coats" + 0.012*"use" + 0.009*"quality" + 0.009*"better" + 0.008*"purchased" + 0.008*"pretty"')
essie = all_nails.copy()
ls = []
for i in essie.index:
if "essie" in essie.loc[i,'title_processed']:
ls.append(i)
essie = essie.iloc[ls,:]
essie = essie.reset_index(drop = True)
len(essie)
3641
essie.overall.value_counts().sort_index(ascending = False)
5 2384 4 508 3 320 2 206 1 223 Name: overall, dtype: int64
import re
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def preprocess(text):
text = text.lower() #lowercase
text = re.sub(r'[^\w\s]', '', text) #remove punctuations
text = re.sub(r'\d+', '', text) #remove numbers
text = " ".join(text.split()) #stripWhitespace
text = text.split()
text = [x for x in text if x not in stop_words] #remove stopwords
text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
text = " ".join(text)
# stemmer_ps = PorterStemmer()
# text = [stemmer_ps.stem(word) for word in text.split()] #stemming
# text = " ".join(text)
# lemmatizer = WordNetLemmatizer()
# text = [lemmatizer.lemmatize(word) for word in text.split()] #lemmatization
# text = " ".join(text)
return(text)
essie['review_processed']=essie['reviewText'].apply(lambda x:preprocess(x))
essie['review_processed']=essie['review_processed'].apply(lambda x:x.split())
print(essie.reviewText.unique())
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [1], in <cell line: 1>() ----> 1 print(essie.reviewText.unique()) NameError: name 'essie' is not defined
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in essie.review_processed[(essie.overall == 4)|(essie.overall == 5)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in essie.review_processed[(essie.overall == 1)|(essie.overall == 2)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences=essie['review_processed'].tolist(),
size=100, sg=1,
min_count=5,window=10,
workers=-1,seed=10,iter=250)
vocab = model.wv.index2word
model.wv.most_similar('disappointed', topn=10)
[('imatation', 0.3182666301727295), ('doesnt', 0.30145254731178284), ('effectively', 0.29990583658218384), ('needs', 0.2948138117790222), ('instead', 0.27057692408561707), ('iridescent', 0.2598556876182556), ('berry', 0.2544700801372528), ('lovers', 0.25341448187828064), ('fitting', 0.2525997757911682), ('seche', 0.24726007878780365)]
from gensim import corpora
dictionary = corpora.Dictionary(essie['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF
id | word | |
---|---|---|
0 | 0 | amazing |
1 | 1 | awesome |
2 | 2 | look |
3 | 3 | neon |
4 | 4 | perfect |
... | ... | ... |
1326 | 1326 | running |
1327 | 1327 | cartel |
1328 | 1328 | visible |
1329 | 1329 | priced |
1330 | 1330 | reasonably |
1331 rows × 2 columns
essie['review_ids']=essie['review_processed'].apply(lambda x:dictionary.doc2bow(x))
from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(essie.review_ids[(essie.overall == 4)|(essie.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.053*"great" + 0.016*"coats" + 0.015*"summer" + 0.012*"coat" + 0.012*"time" + 0.011*"favorite"') (1, '0.032*"perfect" + 0.031*"pink" + 0.028*"great" + 0.022*"beautiful" + 0.019*"summer" + 0.015*"time"') (2, '0.024*"nice" + 0.019*"perfect" + 0.018*"good" + 0.016*"beautiful" + 0.014*"time" + 0.014*"colors"') (3, '0.033*"pretty" + 0.018*"pink" + 0.018*"favorite" + 0.017*"coats" + 0.017*"great" + 0.015*"colors"') (4, '0.037*"great" + 0.020*"looks" + 0.013*"coat" + 0.010*"skin" + 0.010*"neon" + 0.010*"beautiful"')
from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(essie.review_ids[(essie.overall == 1)|(essie.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.030*"coat" + 0.028*"coats" + 0.022*"pink" + 0.016*"u" + 0.015*"does" + 0.012*"streaky"') (1, '0.021*"coats" + 0.019*"doesnt" + 0.014*"matte" + 0.014*"bought" + 0.013*"colors" + 0.013*"looks"') (2, '0.022*"colors" + 0.019*"gray" + 0.015*"dries" + 0.015*"looks" + 0.014*"shown" + 0.013*"pretty"') (3, '0.018*"quality" + 0.015*"looks" + 0.015*"coats" + 0.012*"bought" + 0.011*"amazon" + 0.011*"dont"') (4, '0.037*"disappointed" + 0.034*"coats" + 0.013*"good" + 0.012*"formula" + 0.012*"able" + 0.012*"applied"')
opi = all_nails.copy()
ls = []
for i in opi.index:
if "opi" in opi.loc[i,'title_processed']:
ls.append(i)
opi = opi.iloc[ls,:]
opi = opi.reset_index(drop = True)
len(opi)
678
opi.overall.value_counts().sort_index(ascending = False)
5 492 4 82 3 53 2 23 1 28 Name: overall, dtype: int64
import re
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def preprocess(text):
text = text.lower() #lowercase
text = re.sub(r'[^\w\s]', '', text) #remove punctuations
text = re.sub(r'\d+', '', text) #remove numbers
text = " ".join(text.split()) #stripWhitespace
text = text.split()
text = [x for x in text if x not in stop_words] #remove stopwords
text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
text = " ".join(text)
# stemmer_ps = PorterStemmer()
# text = [stemmer_ps.stem(word) for word in text.split()] #stemming
# text = " ".join(text)
# lemmatizer = WordNetLemmatizer()
# text = [lemmatizer.lemmatize(word) for word in text.split()] #lemmatization
# text = " ".join(text)
return(text)
opi['review_processed']=opi['reviewText'].apply(lambda x:preprocess(x))
opi['review_processed']=opi['review_processed'].apply(lambda x:x.split())
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in opi.review_processed[(opi.overall == 4)|(opi.overall == 5)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in opi.review_processed[(opi.overall == 1)|(opi.overall == 2)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences=opi['review_processed'].tolist(),
size=100, sg=1,
min_count=5,window=10,
workers=-1,seed=10,iter=250)
vocab = model.wv.index2word
model.wv.most_similar('good', topn=10)
[('gave', 0.3487659990787506), ('pinterst', 0.2854136824607849), ('secure', 0.28510773181915283), ('medium', 0.2783038020133972), ('loved', 0.2766246199607849), ('demure', 0.26437440514564514), ('collections', 0.2505810558795929), ('plum', 0.24891437590122223), ('needed', 0.2451680302619934), ('stain', 0.2412152886390686)]
from gensim import corpora
dictionary = corpora.Dictionary(opi['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF
id | word | |
---|---|---|
0 | 0 | brand |
1 | 1 | bought |
2 | 2 | gift |
3 | 3 | recipient |
4 | 4 | thrilled |
... | ... | ... |
1990 | 1990 | knockwurst |
1991 | 1991 | pinky |
1992 | 1992 | taupe |
1993 | 1993 | dressy |
1994 | 1994 | garish |
1995 rows × 2 columns
opi['review_ids']=opi['review_processed'].apply(lambda x:dictionary.doc2bow(x))
from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(opi.review_ids[(opi.overall == 4)|(opi.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.019*"great" + 0.017*"beautiful" + 0.012*"perfect" + 0.011*"coat" + 0.010*"coats" + 0.007*"gorgeous"') (1, '0.013*"coat" + 0.012*"great" + 0.012*"use" + 0.009*"base" + 0.009*"colors" + 0.009*"looks"') (2, '0.021*"coat" + 0.013*"base" + 0.011*"beautiful" + 0.009*"nice" + 0.007*"used" + 0.007*"using"') (3, '0.015*"use" + 0.011*"pink" + 0.010*"remover" + 0.010*"great" + 0.010*"coat" + 0.009*"works"') (4, '0.034*"great" + 0.015*"little" + 0.014*"good" + 0.011*"coat" + 0.010*"use" + 0.009*"pretty"')
from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(opi.review_ids[(opi.overall == 1)|(opi.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.011*"gel" + 0.009*"coat" + 0.009*"day" + 0.008*"stick" + 0.007*"stuff" + 0.007*"brush"') (1, '0.007*"day" + 0.007*"dont" + 0.006*"nude" + 0.006*"waste" + 0.006*"money" + 0.006*"use"') (2, '0.012*"coat" + 0.007*"mess" + 0.006*"time" + 0.006*"dirty" + 0.005*"hands" + 0.005*"sure"') (3, '0.011*"yellow" + 0.010*"look" + 0.006*"didnt" + 0.006*"pink" + 0.005*"white" + 0.005*"time"') (4, '0.008*"coat" + 0.008*"dont" + 0.007*"use" + 0.006*"brand" + 0.006*"didnt" + 0.005*"disappointed"')
cnd_ls = []
for i in beauty.index:
if "nail" in str(beauty.iloc[i]["title"]).lower() or "nails" in str(beauty.iloc[i]["title"]).lower():
if "cnd" in str(beauty.iloc[i]["title"]).lower() or "creative nail" in str(beauty.iloc[i]["title"]).lower():
cnd_ls.append(i)
cnd= beauty.iloc[cnd_ls,:]
len(cnd)
1838
cnd.overall.value_counts().sort_index(ascending = False)
5 1393 4 220 3 121 2 60 1 44 Name: overall, dtype: int64
cnd.head(3)
asin | title | summary | reviewText | overall | reviewerID | reviewerName | reviewTime | unixReviewTime | year | month | review_processed | pos_neg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6889 | B0016LTZD0 | Creative Nail SolarOil, 4 Fluid Ounce | Five Stars | way cheaper to buy the refill! works magic on... | 5 | A22PHKFFVGZ9X6 | lisa_north atlanta | 2015-08-30 | 1440892800 | 2015 | 8 | [way, cheaper, buy, refill, works, magic, beat... | 1 |
9982 | B002K6AHQY | CND Vinylux Weekly Nail Polish, Rock Royalty,... | Lovely! | This is CND's new product. The product itself... | 4 | A2FW71YE37Q2YO | JPS | 2013-05-31 | 1369958400 | 2013 | 5 | [cnds, new, product, product, awesome, color, ... | 1 |
9983 | B002K6AHQY | CND Vinylux Weekly Nail Polish, Rock Royalty,... | Nice color | This is CND's new product. The product itself... | 4 | A2FW71YE37Q2YO | JPS | 2013-05-31 | 1369958400 | 2013 | 5 | [cnds, new, product, product, awesome, color, ... | 1 |
import re
from sklearn import feature_extraction
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def preprocess(text):
text = text.lower() #lowercase
text = re.sub(r'[^\w\s]', '', text) #remove punctuations
text = re.sub(r'\d+', '', text) #remove numbers
text = " ".join(text.split()) #stripWhitespace
text = text.split()
text = [x for x in text if x not in stop_words] #remove stopwords
text = [x for x in text if x not in ["product",'loves','likes','color','just','essie','opi','cnd',
'nail','nails','polish','polishes','really','love','like','im']] #remove task specific stopwords
text = " ".join(text)
# stemmer_ps = PorterStemmer()
# text = [stemmer_ps.stem(word) for word in text.split()] #stemming
# text = " ".join(text)
# lemmatizer = WordNetLemmatizer()
# text = [lemmatizer.lemmatize(word) for word in text.split()] #lemmatization
# text = " ".join(text)
return(text)
cnd['review_processed']=cnd['reviewText'].apply(lambda x:preprocess(x))
cnd['review_processed']=cnd['review_processed'].apply(lambda x:x.split())
/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_38449/2805646083.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy cnd['review_processed']=cnd['reviewText'].apply(lambda x:preprocess(x)) /var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_38449/2805646083.py:26: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy cnd['review_processed']=cnd['review_processed'].apply(lambda x:x.split())
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in cnd.review_processed[(cnd.overall == 4)|(cnd.overall == 5)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
text = " ".join(word for word in cnd.review_processed[(cnd.overall == 1)|(cnd.overall == 2)].astype(str))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(text)
# Visualize the word cloud
wordcloud.to_image()
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences=cnd['review_processed'].tolist(),
size=100, sg=1,
min_count=5,window=10,
workers=-1,seed=10,iter=250)
vocab = model.wv.index2word
model.wv.most_similar('disappointed', topn=10)
[('doesnt', 0.30145254731178284), ('needs', 0.2948138117790222), ('clearly', 0.28421464562416077), ('instead', 0.27057692408561707), ('cuticles', 0.26807481050491333), ('mail', 0.2597600817680359), ('st', 0.25243714451789856), ('seller', 0.24061936140060425), ('easily', 0.23203958570957184), ('peel', 0.23144644498825073)]
from gensim import corpora
dictionary = corpora.Dictionary(cnd['review_processed'])
dictionaryDF = pd.DataFrame()
dictionaryDF['id']=dictionary.keys()
dictionaryDF['word']=dictionary.values()
dictionaryDF
id | word | |
---|---|---|
0 | 0 | beat |
1 | 1 | buy |
2 | 2 | cheaper |
3 | 3 | hands |
4 | 4 | magic |
... | ... | ... |
2529 | 2529 | werent |
2530 | 2530 | editional |
2531 | 2531 | sparknlings |
2532 | 2532 | messing |
2533 | 2533 | meh |
2534 rows × 2 columns
cnd['review_ids']=cnd['review_processed'].apply(lambda x:dictionary.doc2bow(x))
/var/folders/fz/6zhffn0d72s0mwxj9rgr2c700000gn/T/ipykernel_38449/120259428.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy cnd['review_ids']=cnd['review_processed'].apply(lambda x:dictionary.doc2bow(x))
from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(cnd.review_ids[(cnd.overall == 4)|(cnd.overall == 5)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.019*"pretty" + 0.017*"shellac" + 0.015*"good" + 0.015*"great" + 0.011*"use" + 0.011*"coats"') (1, '0.032*"great" + 0.017*"shellac" + 0.014*"colors" + 0.012*"weeks" + 0.011*"light" + 0.011*"use"') (2, '0.025*"great" + 0.022*"shellac" + 0.012*"light" + 0.011*"wear" + 0.011*"lasts" + 0.010*"week"') (3, '0.021*"coat" + 0.012*"colors" + 0.012*"shellac" + 0.011*"nice" + 0.009*"coats" + 0.009*"use"') (4, '0.035*"great" + 0.028*"shellac" + 0.017*"colors" + 0.013*"red" + 0.012*"dark" + 0.011*"look"')
from gensim import models
num_topics=5
ldamodel = models.ldamodel.LdaModel(cnd.review_ids[(cnd.overall == 1)|(cnd.overall == 2)], num_topics = num_topics, id2word=dictionary, passes=1, random_state=100)
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
print(topic)
(0, '0.009*"shellac" + 0.008*"good" + 0.007*"use" + 0.006*"looking" + 0.006*"pretty" + 0.006*"finish"') (1, '0.012*"coats" + 0.009*"shellac" + 0.006*"colors" + 0.006*"days" + 0.006*"pink" + 0.006*"use"') (2, '0.012*"light" + 0.006*"idea" + 0.005*"bad" + 0.005*"pretty" + 0.005*"manicure" + 0.005*"way"') (3, '0.015*"pink" + 0.010*"money" + 0.009*"return" + 0.009*"did" + 0.008*"brown" + 0.008*"dont"') (4, '0.008*"glitter" + 0.007*"peels" + 0.007*"easily" + 0.007*"shellac" + 0.006*"colors" + 0.006*"pink"')